diff --git a/dev/bench/data.js b/dev/bench/data.js
index 90374865a933a..819f0de84d499 100644
--- a/dev/bench/data.js
+++ b/dev/bench/data.js
@@ -1,5 +1,5 @@
 window.BENCHMARK_DATA = {
-  "lastUpdate": 1717746526986,
+  "lastUpdate": 1717746533807,
   "repoUrl": "https://github.com/neuralmagic/nm-vllm",
   "entries": {
     "bigger_is_better": [
@@ -102395,820 +102395,6 @@ window.BENCHMARK_DATA = {
       }
     ],
     "observation_metrics": [
-      {
-        "commit": {
-          "author": {
-            "name": "Andy Linfoot",
-            "username": "andy-neuma",
-            "email": "78757007+andy-neuma@users.noreply.github.com"
-          },
-          "committer": {
-            "name": "GitHub",
-            "username": "web-flow",
-            "email": "noreply@github.com"
-          },
-          "id": "df1f1a00d1fb111ef035ac385fafa38b5ed34488",
-          "message": "switch to GCP based build VM (#201)\n\nSUMMARY:\r\n* switch over to GCP VM's for building stage of \"remote push\"\r\n\r\nNOTE: this is just the start. i'll redo the benchmarking and nightly\r\nworkflows in an upcoming PR.\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>",
-          "timestamp": "2024-04-23T14:46:41Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488"
-        },
-        "date": 1714210777058,
-        "tool": "customBiggerIsBetter",
-        "benches": [
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67721.31853129905,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 136923.50532217047,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 427.07219489930145,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 864.3076764405669,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 160.639388213287,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 219.77572066720023,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 24568.955616300445,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 42835.75935418953,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 227.82634990016953,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 374.61978443056046,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.055520334651355,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 48.596633468447926,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 7527.377659600279,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 13679.095036860263,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 191.89926309954896,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 331.35296574051563,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12.080104513171033,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 14.650709110785206,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 8710.367588801275,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 15965.79030186061,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 275.3981603991636,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 477.1227850806277,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 20.97422668532524,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 38.98037290179124,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 77073.21296540032,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 146163.68240634102,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 3753.1767207005037,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 7267.537872770735,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 255.0737451625434,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 383.5779543276513,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23023.83349860065,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 41645.743912039325,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 232.66971739867586,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 366.3132179202874,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 38.46510305387384,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 47.70392333508084,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 137370.26431799936,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 215869.07510454953,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 61967.11279730116,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67496.72838633043,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 231.63840083851147,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 299.54306943821,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 25435.519305999707,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 43947.11136245994,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 259.40190170113067,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 395.49169726957183,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 82.69133798286272,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 128.12526599542636,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 111535.84533939959,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 139907.84255920982,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 86109.12421779885,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 110743.0007109996,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 79.65798954703632,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 152.7060449417187,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 168754.36456020072,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 274835.28300063947,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 55568.49748070054,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 61393.967672872786,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 310.565785397624,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 368.4110025457586,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 20220.198251899634,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 35722.42199180989,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 177.03949560091132,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 298.8550787503116,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 56.43407178889857,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 69.53072934130422,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 32098.846679599596,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 56772.15222571119,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 333.93383340007864,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 559.5284918699325,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 69.18295931934536,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 115.03590451252023,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 22242.201575200306,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 38416.09949368026,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 405.60888209947734,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 633.2039915198765,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 54.510660354194954,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 116.7819932999009,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 52838.864513999964,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 107615.60263015,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 392.0379830997264,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 591.367701860072,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 136.9365175885084,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 192.056543447482,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 20696.943696499173,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40126.07286110094,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 255.02403710015642,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 407.3263524987848,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 46.06854495174866,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 61.6320865881175,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 20446.17628759952,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 41362.25675381941,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 249.30104090035456,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 437.84951206882346,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 44.60517419920066,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 68.91420816637299,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6740.6105944986875,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12953.2546778108,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 220.07543739982793,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 364.8997653608961,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 14.575114716810681,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 29.404255282207586,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 108043.01666580034,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 152536.4371675998,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 73827.56902039946,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 78139.84387231978,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 117.39992733990071,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 153.14561219914486,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 8047.533504500825,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 14416.730261489747,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 239.05779530032294,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 404.1848056998787,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 13.527288472236323,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 16.885394328748717,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 10284.022520000146,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 17944.387033951258,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 290.2058996996857,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 507.1017831698554,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 24.138703308314387,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 53.33755475741691,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 460924.56688270095,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 503368.02862478956,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 449794.0928350992,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 480406.76205557154,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 82.92364234558158,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 140.85796713146203,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12425.266776699209,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 25722.491432967527,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 315.02099159879435,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 500.7434328587614,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 28.132986743957666,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 47.44577373437636,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
-          }
-        ]
-      },
       {
         "commit": {
           "author": {
@@ -143094,6 +142280,820 @@ window.BENCHMARK_DATA = {
             "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
+      },
+      {
+        "commit": {
+          "author": {
+            "name": "Derek Kozikowski",
+            "username": "derekk-nm",
+            "email": "106621615+derekk-nm@users.noreply.github.com"
+          },
+          "committer": {
+            "name": "GitHub",
+            "username": "web-flow",
+            "email": "noreply@github.com"
+          },
+          "id": "87571b8be8105738d6da87df053d5a32e7fa001e",
+          "message": "add more models, new num_logprobs (#285)\n\nadding the `microsoft/phi-2`, `google/gemma-1.1-2b-it`, and\r\n`HuggingFaceH4/zephyr-7b-gemma-v0.1` models to\r\ntest_basic_server_correctness.py. this required increasing the number of\r\nlogprobs included in the evaluation to avoid unexpected failure for a\r\nfew prompts with these models. this did not negatively impact the other\r\nmodels.\r\n\r\nran the test locally multiple times.  each time we passed, like this:\r\n```\r\n/root/pyvenv/nmv3119a/bin/python3 /root/.local/share/JetBrains/IntelliJIdea2023.3/python/helpers/pycharm/_jb_pytest_runner.py --target test_basic_server_correctness.py::test_models_on_server -- --forked \r\nTesting started at 2:24 PM ...\r\nLaunching pytest with arguments --forked test_basic_server_correctness.py::test_models_on_server --no-header --no-summary -q in /network/derekk/testdev1/nm-vllm/tests/basic_correctness\r\n\r\n============================= test session starts ==============================\r\ncollecting ... collected 7 items\r\nRunning 7 items in this shard: tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None]\r\n\r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None] \r\n\r\n======================== 7 passed in 1332.51s (0:22:12) ========================\r\n```",
+          "timestamp": "2024-06-06T20:15:52Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e"
+        },
+        "date": 1717746532845,
+        "tool": "customBiggerIsBetter",
+        "benches": [
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24809.801940600075,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 43256.87655343981,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 230.85963800004947,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 381.02006250007156,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.570137278423545,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.9601528261819,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143332.51583630027,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 222546.44138056054,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67864.02396499924,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73827.0890284312,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 239.8795270796644,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 308.6613617035208,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32526.437060700042,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57773.36524064003,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 330.9250161998762,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 545.5887492000373,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.53575750276477,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.19020295559704,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6915.1410166995465,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13380.373847579493,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 226.40810460006836,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 359.1773655598529,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15.139491998436716,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 29.79929538126497,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73576.49548339951,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148813.4775623996,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 425.19164659979657,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 836.7314811297817,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.70690829164758,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 229.70171741499104,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23364.73714039989,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41996.36488448967,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 233.36182179996285,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 350.5909243400219,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.84333733572024,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.08712341922941,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 466223.5411801002,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 509291.63272095995,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 454586.06324549974,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 486015.42612930026,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.72618933186322,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.12997684127475,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110610.14581850011,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155921.46428314017,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76759.02024500008,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81712.44376273022,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.14547809132924,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.16527273312516,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 22691.409517400287,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39462.123903539956,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 406.51223129898426,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 626.6219724008079,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.431478742662875,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.34781941991966,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11215.382277100072,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19612.768866099803,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 298.9036618999308,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 517.92870615046,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26.038869194786315,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.07034680126974,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7184.081163700379,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13875.893549129794,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 230.8630073996028,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 366.7540930995157,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15.690801239222802,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.531295486664455,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83522.69804000034,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 167674.00902064957,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 431.77151409990995,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 800.5986656801085,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.5041017158037,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 261.0510703761275,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12683.44704460087,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26111.0034482893,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 314.93118120051804,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 570.3491221899092,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 28.61592294155258,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.83555382492158,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 20670.799412599994,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41707.29727305004,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 249.81241279986102,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 434.6498393798992,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 44.87800991248256,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72.94319169829369,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 8218.9075702001,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14720.760121540156,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 200.3085763999479,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 341.1576650702042,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.05153270980138,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.80131226168232,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25392.581709299528,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 43627.436878489585,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245.67273059983563,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 403.4982441999091,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.75461039469934,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.31011228923826,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 20969.66754760013,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40483.44978941978,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 256.0066148002534,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 405.88605057017014,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.767859650149546,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.807399191316264,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114838.39343790001,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142907.70232090994,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88618.44696559978,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 113273.42560282987,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.62750102888478,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.33145765445408,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 10743.527976099997,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18773.01793947992,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 292.5919326004078,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 492.5039500004457,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.79929943589153,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.07471698160512,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 75642.95190749933,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 141949.51066711123,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5267.717306099259,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7155.850088419046,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 250.14415547355244,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 359.49732220760734,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19982.480930899692,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35210.715996600185,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 175.0473129985039,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 290.22134690003423,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.914559414430734,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.25539483251296,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7825.313705700045,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14168.347028499735,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 195.37728429986598,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 335.6715424597767,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.494404729219081,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.95690672861726,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+          }
+        ]
       }
     ]
   }